Entity Resolution for Deduplication of data

A new post by hswerdfe

Howard Swerdfeger https://hswerdfe.github.io/docs/ (Data DoDo)https://hswerdfe.github.io/docs/
2021-04-09

Motivation

Data quality sucks, and we have to match records together.

Load Libraries

library(knitr)
library(tidyverse)
library(janitor)
library(lubridate)
library(fuzzyjoin) # for common mispellings
#library(maps) # for city Names, but I don't want to override purrr:map
library(babynames) # for first names
library(lexicon) # for Last names
library(reclin) # for de-duplication
library(phonics) # for soundex
library(plotROC) # for ROC curves
library(AUC)# for AUC calculations curves
library(magrittr) #extract2
library(igraph) # Neigbourhood determination
library(lemon)
library(snakecase)
theme_set(theme_minimal())
knit_print.data.frame <- lemon_print

set.seed(as.integer(as.Date("2021-04-09")))

Generate Random Dataset

# Simulation parameters (g_ prefix marks globals used as defaults below)
g_num_entities = 72
g_num_dup_max = 12
g_prob_error = 0.05
g_prob_miss = 0.05

Generate a dataset with:

Some Utility Functions

#'
#' Extract the i-th character of a string.
#'
#' @param i position of the character to extract
#' @param str the input string
#' @param ... passed through to substr()
#'
#' example: s2c(i = 1, "data Doo Doo")
s2c <- function(i, str, ...) {
  substr(str, i, i, ...)
}

#'
#' Split a string into a character vector of single characters.
#'
#' Uses seq_len() so an empty string yields character(0); the original
#' `1:nchar(str)` produced c(1, 0) on an empty string and returned two
#' empty strings. vapply() replaces map() %>% unlist() (base R, type-safe).
#'
#' @param str a single string
#' @param ... passed on to s2c() (and ultimately substr())
#' @return a character vector with one element per character of `str`
#'
#' example:
#'  map_s2c("data Doo Doo")
map_s2c <- function(str, ...) {
  vapply(seq_len(nchar(str)), s2c, character(1), str = str, ...)
}


#'
#' Randomly replace characters in a string.
#'
#' Each character is independently replaced, with probability `prob_char`,
#' by a character drawn from `sub_in`.
#'
#' @param str a single string
#' @param prob_char per-character probability of being replaced
#' @param sub_in pool of replacement characters
#' @return the edited string
#'
#' example: random_edit("billy bob thorton")
random_edit <- function(str, prob_char = g_prob_error, sub_in = letters){
  n <- nchar(str)
  if_else(
    # TRUE marks the positions that will be replaced
    sample(x = c(TRUE, FALSE), size = n, prob = c(prob_char, 1 - prob_char), replace = TRUE),
    # replace = TRUE fixes an error when nchar(str) > length(sub_in):
    # the original sampled the replacement pool without replacement
    sample(x = sub_in, size = n, replace = TRUE),
    map_s2c(str)
  ) %>% paste0(collapse = "")
}



#'
#' Generate a base set of simulated "true" entities.
#'
#' Each entity gets a first/middle/last name, a US city, and a date of birth.
#' Names and cities are sampled proportionally to real-world frequency
#' (babynames counts, lexicon surname proportions, world.cities population).
#'
#' @param num_entities number of distinct people to create
#' @param max_rep each entity gets a duplicate count `n` drawn uniformly
#'   from 1..max_rep, consumed later by uncount()
#' @return a tibble with one row per entity, including a unique `key`
#'
#' example: generate_entities(10)
generate_entities <- function(num_entities = g_num_entities, max_rep = g_num_dup_max){

  # month/day of birth come from a separate uniform draw of calendar dates
  # (the year span here only matters for the month/day distribution)
  dts_for_day_of_year = sample(seq(as.Date("1900-01-01"), as.Date("1901-12-31"),by = "day"), size = num_entities, replace = TRUE)

    tibble(
      # first/middle names: sampled weighted by total historical count
      first_name = 
        babynames::babynames %>% 
        group_by(name) %>% summarise(n = sum(n, na.rm = TRUE)) %>% 
        sample_n(size = num_entities, replace = TRUE, weight = n) %>% 
        pull(name),
      middle_name = 
        babynames::babynames %>% 
        group_by(name) %>% summarise(n = sum(n, na.rm = TRUE)) %>% 
        sample_n(size = num_entities, replace = TRUE, weight = n) %>% 
        pull(name) ,     
      # surnames weighted by their frequency proportion
      last_name = 
        lexicon::freq_last_names %>% 
        clean_names() %>%
        sample_n(size = num_entities, replace = TRUE, weight = prop) %>%
        pull(surname),
      # US cities weighted by population
      city = 
        maps::world.cities %>% 
        clean_names() %>%
        filter(country_etc =="USA") %>% 
        sample_n(size = num_entities, replace = TRUE, weight = pop) %>% pull(name), 
      # birth year sampled weighted by yearly birth totals, independently of
      # the month/day draw above
      dob_year = 
        babynames::babynames  %>% 
        group_by(year) %>% 
        summarise(n = sum(n, na.rm = TRUE)) %>%
        sample_n(size = num_entities, replace = TRUE, weight = n) %>% 
        pull(year),
      dob_month = month(dts_for_day_of_year),
      dob_day = day(dts_for_day_of_year)
    ) %>% 
    mutate(dob = ISOdate(dob_year, dob_month, dob_day)) %>% 
    #select(-dob_year, -dob_month, -dob_day) %>% 
    # key = ground-truth entity id; n = how many noisy copies to generate
    mutate(., key = 1:nrow(.)) %>%
    mutate(n = sample(1:max_rep, size = num_entities, replace = TRUE))  
  
}

Create Sample dataset

In this Dataset every record has a name a city and a date of birth.

Additionally it has a key we will use that later to figure out which record is which.

We also generate n which is how many times we will duplicate that record and make random edits to simulate noise.

entities <- generate_entities()
entities %>% sample_n(7)
# A tibble: 7 x 10
  first_name middle_name last_name city     dob_year dob_month dob_day
  <chr>      <chr>       <chr>     <chr>       <dbl>     <dbl>   <int>
1 Patrick    Veva        Mcdonald  New York     1959        10      26
2 Gemma      Ricki       Padilla   Houston      1989         6      10
3 Joshua     Kenneth     Anderson  Los Ang~     2016        12      21
4 Alyssa     Lillian     James     San Jose     1991         6      24
5 Sherri     Jaret       Jackson   Tucson       1943         7      29
6 Charles    Roger       Jensen    Los Ang~     1950        10      17
7 Thiago     Lindsay     Johnson   Charles~     2007         5       7
# ... with 3 more variables: dob <dttm>, key <int>, n <int>

Noisy Data

Make the data noisy by randomly editing characters with probability 0.05 and setting 0.05 of random cells to NA. Notice how the names clearly don't look as clean.

# Build the noisy dataset: duplicate each entity `n` times, then corrupt it.
noisy <- 
  entities %>% 
  # one row per duplicate; the `n` column is consumed here
  uncount(weights = n) %>% 
  #select(-key) %>% 
  #head(5000) %>% 
  # randomly edit characters in each string column (see random_edit)
  mutate(first_name = purrr::map(first_name, random_edit) %>% unlist()) %>%
  mutate(middle_name = purrr::map(middle_name, random_edit) %>% unlist()) %>%  
  mutate(last_name = purrr::map(last_name, random_edit) %>% unlist()) %>%
  mutate(city = purrr::map(city, random_edit) %>% unlist()) %>% 
  # date parts are edited as strings, substituting plausible digits/values
  mutate(dob_year = purrr::map(dob_year, random_edit, sub_in = as.character(0:9)) %>% unlist() %>% as.integer()) %>%
  mutate(dob_month = purrr::map(dob_month, random_edit, sub_in = as.character(1:12)) %>% unlist() %>% as.integer()) %>%
  mutate(dob_day = purrr::map(dob_day, random_edit, sub_in = as.character(1:31)) %>% unlist() %>% as.integer()) %>%
  mutate(dob = ISOdate(dob_year, dob_month, dob_day))   %>%
  select(-dob_year, -dob_month, -dob_day) %>%
  # blank out a g_prob_miss fraction of cells; dob gets ISOdate(99, 99, 99)
  # (an invalid date that evaluates to NA) so if_else() stays POSIXct-typed
  mutate(.,first_name = if_else(sample(x = c(T,F), size = nrow(.), replace = T, prob = c(g_prob_miss, 1-g_prob_miss)),
                               as.character(NA), first_name)) %>%
  mutate(.,last_name = if_else(sample(x = c(T,F), size = nrow(.), replace = T, prob = c(g_prob_miss, 1-g_prob_miss)),
                                 as.character(NA), last_name)) %>%  
  mutate(.,city = if_else(sample(x = c(T,F), size = nrow(.), replace = T, prob = c(g_prob_miss, 1-g_prob_miss)),
                                as.character(NA), city)) %>%  
   mutate(.,dob = if_else(sample(x = c(T,F), size = nrow(.), replace = T, prob = c(g_prob_miss, 1-g_prob_miss)),
                          ISOdate(99, 99, 99), dob)) 

noisy %>% sample_n(7)
# A tibble: 7 x 6
  first_name middle_name last_name city      dob                   key
  <chr>      <chr>       <chr>     <chr>     <dttm>              <int>
1 Darlene    Doris       Jawssen   New Brau~ 1950-06-15 12:00:00    68
2 Carolyn    Samuel      Agustin   Newton    1943-06-06 12:00:00     6
3 Jeremy     Hayden      Daniels   Schenect~ 1952-11-04 12:00:00    34
4 Pclly      Mckexzie    randy     Riverside 1970-03-01 12:00:00     9
5 Sherki     Jarkt       eackson   Tucson    1943-07-29 12:00:00    31
6 Alyssa     Lillian     James     San Jose  1991-06-24 12:00:00    70
7 Jamie      Ariana      Wriyht    <NA>      1998-01-04 12:00:00    16

Look at duplicates of one key

# Pick a key with a middling number of duplicates: order keys by count and
# take the one halfway down.
key_of_interest <-
  noisy %>%
  count(key, sort = T) %>%
  slice(as.integer(g_num_entities/2)) %>%
  pull(key)

# Show every noisy duplicate of that one entity.
noisy %>%
  filter(key == key_of_interest)
# A tibble: 6 x 6
  first_name middle_name last_name city      dob                   key
  <chr>      <chr>       <chr>     <chr>     <dttm>              <int>
1 Txtry      vanzel      Waluh     Jackoonv~ 1974-02-05 12:00:00    32
2 Terry      Daniel      Waxsh     Jacksonv~ 1974-02-05 12:00:00    32
3 qerry      Daniel      Walsh     Jadzsmnv~ 1974-02-05 12:00:00    32
4 Terry      faniel      Walsk     Japzsonv~ 1974-02-05 12:00:00    32
5 Terry      Daninl      Walsh     Jacksonv~ 1974-03-05 12:00:00    32
6 Txrry      Danieg      <NA>      Jacksonv~ 1974-02-05 12:00:00    32

Each column will give us some indication that it is the same person, but no single column by itself will be enough to tell us, so we will look at multiple columns. There are various metrics we can look at.

Add soundex

# Append soundex-encoded versions of the string columns (key and dob are
# excluded); the phonetic code is robust to many single-character typos.
noisy <- 
  noisy %>% 
  select(-key, -dob) %>% 
  mutate_all(phonics::soundex) %>% 
  rename_all(~paste0(.x, "_soundex")) %>% 
  bind_cols(noisy, .)

noisy %>% sample_n(7)
# A tibble: 7 x 10
  first_name middle_name last_name city      dob                   key
  <chr>      <chr>       <chr>     <chr>     <dttm>              <int>
1 Patricia   Serena      Dunn      Saint Pe~ 1963-08-13 12:00:00    65
2 Tom        Caitlin     Brooks    Enid      1951-03-04 12:00:00    11
3 Hattie     Cameron     Regmlado  Fairfield 2007-10-11 12:00:00     2
4 Raymond    Margaret    Mason     Huntingt~ 1620-07-30 12:00:00    30
5 mubrles    Royer       Jensen    Los Ange~ 1950-10-17 12:00:00    20
6 Jonathan   nary        Caron     Houston   2008-12-16 12:00:00    47
7 Carolyn    Samuwl      Agustin   Newton    1943-06-06 12:00:00     6
# ... with 4 more variables: first_name_soundex <chr>,
#   middle_name_soundex <chr>, last_name_soundex <chr>,
#   city_soundex <chr>

Pairs of values

# columns to do comparisons on: everything except the ground-truth key,
# i.e. the raw fields, dob, and the soundex-encoded fields
cols_4_simmilarity <- noisy %>% select(-key) %>% colnames()
cols_4_simmilarity
[1] "first_name"          "middle_name"         "last_name"          
[4] "city"                "dob"                 "first_name_soundex" 
[7] "middle_name_soundex" "last_name_soundex"   "city_soundex"       

blocking

blocking for any one variable is the same

Blocking is how you limit the computational complexity of what you are doing, here I am blocking on any one variable being identical, which seems reasonable as every column could be wrong with some small probability.

  #
  # blocking: keep pairs where at least one variable matches exactly
  # (union over single-variable blockings)
  #
p1 <- 
  noisy %>% 
  select(-key) %>% 
  colnames() %>% 
  map_dfr(~pair_blocking(noisy, noisy, blocking_var=.x, large = F))



#
# pairs with no blocking (this is much longer)
#
pa <- pair_blocking(x= noisy, y = noisy, large = F)


# NOTE(review): despite the original comment, the un-blocked pairs `pa` are
# what is carried forward here, not the blocked pairs `p1`.
p <- pa#p1
p
p
Simple blocking
  No blocking used.
  First data set:  429 records
  Second data set: 429 records
  Total number of pairs: 184 041 pairs

Showing first 20 pairs:
    x y
1   1 1
2   2 1
3   3 1
4   4 1
5   5 1
6   6 1
7   7 1
8   8 1
9   9 1
10 10 1
11 11 1
12 12 1
13 13 1
14 14 1
15 15 1
16 16 1
17 17 1
18 18 1
19 19 1
20 20 1

Above, p1 has 46271 rows while pa has 184041 rows. This is on 429 records and 72 people, and this discrepancy gets larger as the number of records grows. Here I am using pa (no blocking) as the set of potential pairs.

#'
#' Build a comparator for one component of a date.
#'
#' Returns a closure following the reclin comparator protocol: called with
#' (x, y) it scores the pair; called with a single argument it converts a
#' previously computed score into a TRUE/FALSE match decision.
#'
#' @param date_part one of "m" (month), "d" (day) or "y" (year)
#' @param threshold cut-off used when the returned function is called with a
#'   single score argument. The original referenced an undefined free
#'   variable `threshold` in that branch, which errored at call time; it is
#'   now a parameter with a default, so existing calls keep working.
date_comp <- function(date_part, threshold = 0.5){
    function(x, y) {
        if (!missing(y)) {
          if (date_part == "m"){
            month(x) == month(y)
          }else if(date_part == "d"){
            day(x) == day(y)
          }else if(date_part == "y"){
            # NOTE(review): returns the absolute year difference (larger =
            # less similar), unlike "m"/"d" which return TRUE/FALSE --
            # confirm this asymmetry is intended.
            abs(year(x) - year(y))
          }else{
            0
          }
        }
        else {
            # score -> decision: above threshold and not missing
            (x > threshold) & !is.na(x)
        }
    }
}
#date_comp
#date_comp

Similarity metrics

We now have pairs to compare, so we look at how close individual cells in matching pairs are to each other; to do this we generate some comparisons. Here we generate 4 separate ways of comparing the strings across 3 different models. All the models will share soundex as a potential matching method. Then each model will specialize in either Jaro-Winkler, Longest Common Substring, or Jaccard similarity.

#########################
# soundex columns are compared for exact identity in every model
func_list_by_col <- 
  list(first_name_soundex = identical(), 
       middle_name_soundex = identical(),
       last_name_soundex = identical(), 
       city_soundex = identical())

########################
# The library does not seem to support multiple comparators on the same column at the same time so we do this, for the next best thing
default_comparator_list <- 
  list(jw = jaro_winkler(), 
       lcs = lcs(),
       jaccard = jaccard()
      )

############################
# generate similarities for all the matching pairs: one comparison table per
# default comparator (jw / lcs / jaccard), all sharing the soundex comparators
p_s <- 
  default_comparator_list %>%
  map(function(func){
    compare_pairs(pairs = p, 
                  by = cols_4_simmilarity,
                  comparators = func_list_by_col,
                  default_comparator = func)
  })
names(p_s) <- names(default_comparator_list)

Observing the simmilarities

Here we sample from the pairs with similarities p_s to see that we now have many numerical values indicating how close each x y pair is along various metrics, higher numbers in general mean more likely to be a match as does TRUE in the binary columns.

ps_combined <- p_s %>% bind_rows(.id = 'model_name') 
ps_combined %>% sample_n(5) %>% as_tibble()
# A tibble: 5 x 12
  model_name     x     y first_name middle_name last_name  city    dob
  <chr>      <int> <int>      <dbl>       <dbl>     <dbl> <dbl>  <dbl>
1 jw           323   104      0.483       0.421    NA     0.519  0.826
2 jaccard       77   296      0           0         0     0     NA    
3 jw           130   115      0.442       0.556     0.625 0.448  0.782
4 lcs          399   377      0.333       0.75     NA     0.308 NA    
5 jw           281    28      0           0.448     0     0.423  0.815
# ... with 4 more variables: first_name_soundex <lgl>,
#   middle_name_soundex <lgl>, last_name_soundex <lgl>,
#   city_soundex <lgl>

True Pairs

Summation (simplest possible)

From here we try to go from potential pairs to true pairs. The easiest thing to do is sum across all the numeric columns, and say that higher number as more likely to be a match.

###############################
# simplest possible score: sum the per-column similarities into `simpl_sum`
p_s <- 
  p_s %>% 
  map(score_simsum, var = "simpl_sum")

# density of the summed score, one curve per comparator model
ps_combined <- p_s %>% bind_rows(.id = 'model_name') 
ps_combined %>%
  ggplot(aes(x = simpl_sum, color = model_name)) + 
  geom_density() +
  labs(color = "Model", x = "Total Simmilarity Score", y = "Density", title = "Density of total scores of potential pairs by model")

# default_comparator_list <- 
#   list(jw = jaro_winkler(), 
#        lcs = lcs(),
#        jaccard = jaccard()
#       )
# p_s <- 
#   default_comparator_list %>%
#   map(function(func){
#     compare_pairs(pairs = p, 
#                   by = cols_4_simmilarity,
#                   comparators = func_list_by_col,
#                   default_comparator = func)
#   })
# names(p_s) <- names(default_comparator_list)

EM algorithm

The EM Algorithm from the 2007 book Data Quality and Record Linkage Techniques generates the probability that each column contains information about whether the record pair is or is not a match.

# Fit the EM model for each comparator's pair table; this estimates the
# per-column m-probabilities (agreement given a true match) and
# u-probabilities (agreement given a non-match).
em_s <- 
  p_s %>% 
  map(problink_em)
em_s 
$jw
M- and u-probabilities estimated by the EM-algorithm:
            Variable M-probability U-probability
          first_name     0.4261385  1.239424e-09
         middle_name     0.6986917  4.067077e-09
           last_name     0.4094873  9.044788e-16
                city     0.4096737  3.182302e-03
                 dob     0.4764895  1.187817e-03
  first_name_soundex     0.4841342  4.286314e-03
 middle_name_soundex     0.7627071  8.458971e-04
   last_name_soundex     0.4743486  9.135294e-04
        city_soundex     0.3774117  5.705523e-04

Matching probability: 0.02480617.

$lcs
M- and u-probabilities estimated by the EM-algorithm:
            Variable M-probability U-probability
          first_name     0.6758350  6.074621e-06
         middle_name     0.7881897  7.229890e-03
           last_name     0.7160520  2.463114e-14
                city     0.7615701  7.337231e-03
                 dob     0.7425143  5.190125e-03
  first_name_soundex     0.6031999  4.284000e-03
 middle_name_soundex     0.7067278  5.810422e-03
   last_name_soundex     0.5981588  7.827820e-04
        city_soundex     0.4508693  9.752591e-04

Matching probability: 0.01989889.

$jaccard
M- and u-probabilities estimated by the EM-algorithm:
            Variable M-probability U-probability
          first_name     0.4363094  1.486481e-09
         middle_name     0.6933837  1.558471e-08
           last_name     0.4192609  7.779157e-16
                city     0.4171563  3.170574e-03
                 dob     0.4705465  5.812155e-04
  first_name_soundex     0.4952393  4.294945e-03
 middle_name_soundex     0.7576207  1.423543e-03
   last_name_soundex     0.4852706  9.229072e-04
        city_soundex     0.3868390  5.598046e-04

Matching probability: 0.02422578.
# Convert each fitted pair table into a per-pair linkage weight `em_weight`.
p_s <-
  p_s %>% 
  map(function(p_i){
    score_problink(p_i, var = "em_weight")
  })    


# density of the EM weight, one curve per comparator model
ps_combined <- p_s %>% bind_rows(.id = 'model_name') 
ps_combined %>%
  ggplot(aes(x = em_weight, color = model_name)) + 
  geom_density() +
  labs(color = "Model", x = "EM Weight Score", y = "Density", title = "Density of total scores of potential pairs by model")

results and metrics

Generate a results table; the is_same column tells us if the two records are truly from the same person.

# Stack the three models' pair tables into one frame, tagging each row with
# the comparator (`def_func`) that produced it.
p_results <- 
  map2_dfr(names(p_s), p_s, function(nm, p){
    p$def_func = nm
    p %>% as_tibble()
  }) 

# lookup table: record row number -> true entity key
noisy_kr <- 
  noisy %>% 
  mutate(., row = 1:nrow(.)) %>% 
  select("key", "row")

# attach the true keys of both records in each pair; is_same is the ground
# truth (TRUE when both records were generated from the same entity)
p_results <- 
  p_results %>% 
  left_join(noisy_kr %>% set_names(c("key_x", "x")), by = "x") %>% 
  left_join(noisy_kr %>% set_names(c("key_y", "y")), by = "y")  %>% 
  mutate(is_same = key_x == key_y)
p_results %>% summary()
       x             y         first_name     middle_name    
 Min.   :  1   Min.   :  1   Min.   :0.00    Min.   :0.0000  
 1st Qu.:108   1st Qu.:108   1st Qu.:0.00    1st Qu.:0.0000  
 Median :215   Median :215   Median :0.15    Median :0.1667  
 Mean   :215   Mean   :215   Mean   :0.21    Mean   :0.2191  
 3rd Qu.:322   3rd Qu.:322   3rd Qu.:0.43    3rd Qu.:0.4365  
 Max.   :429   Max.   :429   Max.   :1.00    Max.   :1.0000  
                             NA's   :60048                   
   last_name          city            dob         first_name_soundex
 Min.   :0.00    Min.   :0.00    Min.   :0.11     Mode :logical     
 1st Qu.:0.00    1st Qu.:0.00    1st Qu.:0.40     FALSE:483138      
 Median :0.15    Median :0.18    Median :0.56     TRUE :8937        
 Mean   :0.21    Mean   :0.23    Mean   :0.58     NA's :60048       
 3rd Qu.:0.43    3rd Qu.:0.42    3rd Qu.:0.78                       
 Max.   :1.00    Max.   :1.00    Max.   :1.00                       
 NA's   :55176   NA's   :55176   NA's   :109755                     
 middle_name_soundex last_name_soundex city_soundex   
 Mode :logical       Mode :logical     Mode :logical  
 FALSE:541224        FALSE:489960      FALSE:245088   
 TRUE :10899         TRUE :6987        TRUE :5475     
                     NA's :55176       NA's :301560   
                                                      
                                                      
                                                      
   simpl_sum        em_weight         def_func        
 Min.   :0.0000   Min.   : -9.408   Length:552123     
 1st Qu.:0.4913   1st Qu.: -5.990   Class :character  
 Median :1.2121   Median : -5.070   Mode  :character  
 Mean   :1.3285   Mean   : -3.373                     
 3rd Qu.:1.8183   3rd Qu.: -3.085                     
 Max.   :9.0000   Max.   :107.512                     
                                                      
     key_x           key_y        is_same       
 Min.   : 1.00   Min.   : 1.00   Mode :logical  
 1st Qu.:16.00   1st Qu.:16.00   FALSE:541530   
 Median :35.00   Median :35.00   TRUE :10593    
 Mean   :34.66   Mean   :34.66                  
 3rd Qu.:51.00   3rd Qu.:51.00                  
 Max.   :72.00   Max.   :72.00                  
                                                
p_results
# A tibble: 552,123 x 17
       x     y first_name middle_name last_name  city    dob
   <int> <int>      <dbl>       <dbl>     <dbl> <dbl>  <dbl>
 1     1     1      1           1         1     1      1    
 2     2     1      1           0.778     0.867 0.917 NA    
 3     3     1      1           0.889     0.867 1      1    
 4     4     1      1           0.889     0.867 1      1    
 5     5     1      1           0.778     0.867 1      0.919
 6     6     1      1           0.889     0.867 1      1    
 7     7     1      0.889       0.889     0.867 0.917  0.965
 8     8     1      0.889       0.889     0.867 0.833  1    
 9     9     1      1           0.889     0.733 0.917  1    
10    10     1      1           0.889     0.867 1      1    
# ... with 552,113 more rows, and 10 more variables:
#   first_name_soundex <lgl>, middle_name_soundex <lgl>,
#   last_name_soundex <lgl>, city_soundex <lgl>, simpl_sum <dbl>,
#   em_weight <dbl>, def_func <chr>, key_x <int>, key_y <int>,
#   is_same <lgl>
# Distribution of every similarity metric, faceted by column; the soundex
# comparisons (identical across models) are relabelled as their own "model".
p_results %>% 
  select(-x, -y) %>% 
  mutate_if(is.logical, as.double) %>% 
  pivot_longer(!def_func) %>%
  mutate(def_func = if_else(grepl("soundex",name), "soundex", def_func)) %>% 
  mutate(name = if_else(grepl("soundex",name), gsub("_soundex", "", name), name)) %>% 
  filter(!name %in% c('key_x','key_y','is_same')) %>% 
  #mutate(name = snakecase::to_title_case(name)) %>% 
  ggplot(aes(x = value, color = def_func)) + 
  geom_density() + 
#  facet_grid(cols = vars(name), rows = vars(def_func),  scales = "free")
  facet_wrap(~ name, scales = "free") + 
  labs(color = "model", title = "similarity metrics distribution across each column and each model")

# Long format: one row per (pair, score type), with a combined model name
# like "jw_simpl_sum" used for the model comparisons below.
p_results %>% 
  select(simpl_sum, em_weight, def_func, is_same) %>%
  pivot_longer(., cols=c("simpl_sum", "em_weight")) %>% 
  mutate(mdl_nm = paste0(def_func, "_", name )) 
# A tibble: 1,104,246 x 5
   def_func is_same name       value mdl_nm      
   <chr>    <lgl>   <chr>      <dbl> <chr>       
 1 jw       TRUE    simpl_sum   8    jw_simpl_sum
 2 jw       TRUE    em_weight 101.   jw_em_weight
 3 jw       TRUE    simpl_sum   4.56 jw_simpl_sum
 4 jw       TRUE    em_weight  26.1  jw_em_weight
 5 jw       TRUE    simpl_sum   6.76 jw_simpl_sum
 6 jw       TRUE    em_weight  44.3  jw_em_weight
 7 jw       TRUE    simpl_sum   6.76 jw_simpl_sum
 8 jw       TRUE    em_weight  44.3  jw_em_weight
 9 jw       TRUE    simpl_sum   5.56 jw_simpl_sum
10 jw       TRUE    em_weight  31.1  jw_em_weight
# ... with 1,104,236 more rows

Look at Model Results

Looking at the model results from the ROC perspective I can say that clearly I did not make this problem hard enough, and, well, I am evaluating on my training set. Moving on.

# ROC curve per (comparator, score) combination; AUC is prepended to the
# legend label so the legend doubles as a ranking.
p_results %>% 
  select(simpl_sum, em_weight, def_func, is_same) %>%
  pivot_longer(., cols=c("simpl_sum", "em_weight")) %>% 
  mutate(mdl_nm = paste0(def_func, "_", name )) %>%
  group_by(mdl_nm) %>% 
  mutate(auc = AUC::auc(AUC::roc(value , as.factor(is_same)))) %>% 
  ungroup() %>% 
  mutate(mdl_nm = paste0("(", round(auc, 5), ") ", mdl_nm )) %>%
  mutate(mdl_nm = fct_reorder(mdl_nm, auc)) %>%
  ggplot(aes(d = is_same, m = value, color = mdl_nm)) + 
  geom_roc(n.cuts = 10) + 
  style_roc() + 
  labs()

###Select Best Model

# Same ROC computation, but keep only the single best model by AUC.
p_results %>% 
  select(simpl_sum, em_weight, def_func, is_same) %>%
  pivot_longer(., cols=c("simpl_sum", "em_weight")) %>% 
  mutate(mdl_nm = paste0(def_func, "_", name )) %>%
  group_by(mdl_nm) %>% 
  mutate(auc = AUC::auc(AUC::roc(value , as.factor(is_same)))) %>% 
  ungroup() %>% 
  mutate(mdl_nm = paste0("(", auc, ") ", mdl_nm )) %>%
  mutate(mdl_nm = fct_reorder(mdl_nm, auc)) %>%
  filter(auc == max(auc)) %>%
  ggplot(aes(d = is_same, m = value, color = mdl_nm)) + 
  geom_roc(n.cuts = 20) + 
  style_roc() + 
  labs()

# Keep the rows of the single best (comparator, score) model, as measured by
# AUC; x and y are retained so pairs can be turned into graph edges later.
best_model_results <- 
  p_results %>% 
  select(simpl_sum, em_weight, def_func, is_same, x, y) %>%
  pivot_longer(., cols=c("simpl_sum", "em_weight")) %>% 
  mutate(mdl_nm = paste0(def_func, "_", name )) %>%
  group_by(mdl_nm) %>% 
  mutate(auc = AUC::auc(AUC::roc(value , as.factor(is_same)))) %>% 
  ungroup() %>% 
  filter(auc == max(auc)) 
best_model_results
# A tibble: 184,041 x 8
   def_func is_same     x     y name      value mdl_nm          auc
   <chr>    <lgl>   <int> <int> <chr>     <dbl> <chr>         <dbl>
 1 lcs      TRUE        1     1 em_weight  73.3 lcs_em_weight  1.00
 2 lcs      TRUE        2     1 em_weight  17.8 lcs_em_weight  1.00
 3 lcs      TRUE        3     1 em_weight  32.6 lcs_em_weight  1.00
 4 lcs      TRUE        4     1 em_weight  32.6 lcs_em_weight  1.00
 5 lcs      TRUE        5     1 em_weight  22.7 lcs_em_weight  1.00
 6 lcs      TRUE        6     1 em_weight  32.6 lcs_em_weight  1.00
 7 lcs      TRUE        7     1 em_weight  10.4 lcs_em_weight  1.00
 8 lcs      TRUE        8     1 em_weight  12.7 lcs_em_weight  1.00
 9 lcs      TRUE        9     1 em_weight  28.8 lcs_em_weight  1.00
10 lcs      TRUE       10     1 em_weight  32.6 lcs_em_weight  1.00
# ... with 184,031 more rows

Another way of looking at the data
# Bin a numeric vector at its quantiles.
#
# Breaks are the quantiles at probabilities 0, by, 2*by, ..., 1; duplicate
# quantiles are collapsed so cut() always gets strictly increasing breaks.
cut_quantiles <- function(x, by = 0.02) {
  probs <- seq(0, 1, by = by)
  breaks <- unique(c(quantile(x, probs = probs)))
  cut(x, breaks = breaks, include.lowest = TRUE)
}
# For each score quantile bin, show the fraction of pairs that are true
# matches -- a calibration-style view of the best model's score.
best_model_results %>%
  mutate(value_cut = cut_quantiles(value)) %>% 
  group_by(value_cut, is_same) %>%
  summarise(n = n()) %>% ungroup() %>%
  group_by(value_cut) %>% mutate(f = n/sum(n)) %>% ungroup() %>% 
  ggplot(aes(x = value_cut, y = f, fill = is_same)) + geom_col() + coord_flip()

select Threshold

# Cumulative fraction of true matches and non-matches below each score value;
# useful for eyeballing threshold trade-offs.
best_model_results %>% 
arrange(value) %>% 
mutate(f_cumsum_true=cumsum(is_same)/sum(is_same)) %>% 
mutate(f_cumsum_false=cumsum(!is_same)/sum(!is_same)) %>% 
rename(score:=value,
       score_typ:=name) %>%
pivot_longer(cols = c(f_cumsum_true, f_cumsum_false)) %>%
ggplot(aes(x=score, y=value, color= name)) + geom_line()

# Threshold chosen so that ~99.95% of the non-matches score below it
# (accepting roughly 0.05% false positives).
Threshold <- 
  best_model_results %>% 
  arrange(value) %>% 
  mutate(f_cumsum_true=cumsum(is_same)/sum(is_same)) %>% 
  mutate(f_cumsum_false=cumsum(!is_same)/sum(!is_same)) %>%
  arrange(abs(f_cumsum_false - 0.9995)) %>%
  slice(1) %>% pull(value)
Threshold
[1] 8.101735
# Alternative threshold: the score below which 75% of the true matches fall
# (not used for the graph below; shown for comparison).
Threshold2 <- 
  best_model_results %>% 
  arrange(value) %>% 
  mutate(f_cumsum_true=cumsum(is_same)/sum(is_same)) %>% 
  mutate(f_cumsum_false=cumsum(!is_same)/sum(!is_same)) %>%
  arrange(abs(f_cumsum_true - 0.75)) %>%
  slice(1) %>% pull(value)
Threshold2
[1] 65.64459

Make the Graph of duplicates

Threshold
[1] 8.101735
# Start from an empty undirected graph with one named vertex per noisy record.
g <- make_empty_graph(n = 0, directed=FALSE) +
    vertices(paste0("N_",1:nrow(noisy)))


# Every pair scoring at or above the threshold becomes an edge, flattened to
# the interleaved vector form add_edges() expects. Self-pairs (x == y) are
# deliberately kept (the filter is commented out), so the graph has loops.
edges_vector <- 
  best_model_results %>% 
  filter(value >= Threshold) %>% 
  #filter(x != y) %>% 
  select(x, y) %>% #head(5) %>% 
  pmap(function(x, y){
    c(x,y)
  }) %>% unlist()

#edges_vector
gorder(g) %>% print()
[1] 429
gsize(g) %>% print()
[1] 0
#g
g<-add_edges(g, edges_vector)
gorder(g) %>% print()
[1] 429
gsize(g) %>% print()
[1] 3517
g
IGRAPH 2ee511f UN-- 429 3517 -- 
+ attr: name (v/c)
+ edges from 2ee511f (vertex names):
 [1] N_1--N_1  N_1--N_2  N_1--N_3  N_1--N_4  N_1--N_5  N_1--N_6 
 [7] N_1--N_7  N_1--N_8  N_1--N_9  N_1--N_10 N_1--N_11 N_1--N_2 
[13] N_2--N_2  N_2--N_3  N_2--N_4  N_2--N_5  N_2--N_6  N_2--N_7 
[19] N_2--N_8  N_2--N_9  N_2--N_10 N_2--N_11 N_1--N_3  N_2--N_3 
[25] N_3--N_3  N_3--N_4  N_3--N_5  N_3--N_6  N_3--N_7  N_3--N_8 
[31] N_3--N_9  N_3--N_10 N_3--N_11 N_1--N_4  N_2--N_4  N_3--N_4 
[37] N_4--N_4  N_4--N_5  N_4--N_6  N_4--N_7  N_4--N_8  N_4--N_9 
[43] N_4--N_10 N_4--N_11 N_1--N_5  N_2--N_5  N_3--N_5  N_4--N_5 
+ ... omitted several edges

Get disconnected components of the graph

# Split the graph into its connected components: each component is one
# predicted entity. `gi` is one randomly chosen component, plotted later.
g_s <- decompose.graph(g)
gi <- 
  g_s %>%
  sample(x = ., size=1) %>% 
  extract2(1) 
print(length(g_s))
[1] 67
print(gi)
IGRAPH 2f38396 UN-- 7 56 -- 
+ attr: name (v/c)
+ edges from 2f38396 (vertex names):
 [1] N_100--N_100 N_100--N_100 N_100--N_101 N_100--N_101 N_101--N_101
 [6] N_101--N_101 N_100--N_102 N_100--N_102 N_101--N_102 N_101--N_102
[11] N_102--N_102 N_102--N_102 N_100--N_103 N_100--N_103 N_101--N_103
[16] N_101--N_103 N_102--N_103 N_102--N_103 N_103--N_103 N_103--N_103
[21] N_100--N_104 N_100--N_104 N_101--N_104 N_101--N_104 N_102--N_104
[26] N_102--N_104 N_103--N_104 N_103--N_104 N_104--N_104 N_104--N_104
[31] N_100--N_105 N_100--N_105 N_101--N_105 N_101--N_105 N_102--N_105
[36] N_102--N_105 N_103--N_105 N_103--N_105 N_104--N_105 N_104--N_105
+ ... omitted several edges
# Build a (component id, vertex name, record number) table from the
# connected components. The original grew `gwithn` one add_row() at a time
# inside nested for-loops (quadratic copying); this builds one tibble per
# component and binds them once, preserving row order and column types.
gwithn <-
  map_dfr(seq_along(g_s), function(i) {
    node_ids <- as_ids(V(g_s[[i]]))
    tibble(
      g = paste0("g_", i),
      n_nm = node_ids,
      # strip the "N_" prefix to recover the row number in `noisy`
      n = as.integer(gsub(pattern = "N_", replacement = "", x = node_ids))
    )
  }) %>%
  distinct()
gwithn
# A tibble: 429 x 3
   g     n_nm      n
   <chr> <chr> <int>
 1 g_1   N_1       1
 2 g_1   N_2       2
 3 g_1   N_3       3
 4 g_1   N_4       4
 5 g_1   N_5       5
 6 g_1   N_6       6
 7 g_1   N_7       7
 8 g_1   N_8       8
 9 g_1   N_9       9
10 g_1   N_10     10
# ... with 419 more rows

Above, g identifies a group of records predicted to be one person and n_nm identifies a record. For the fully disconnected graph we have predicted 67 distinct people in 429 records, when there are in fact nrow(entities) people in the original dataset.

Based on community detection rather than fully disconnected components

# Group records by graph community instead of connected component.
# fastgreedy requires a simple undirected graph (no self-loops or
# multi-edges), hence simplify(as.undirected(g)).
g_c <- fastgreedy.community(simplify(as.undirected(g)))

# One row per (community, member). The original used walk() with `<<-` to
# grow a global tibble row by row; map_dfr builds the same table without
# mutating global state.
gwithn <-
  map_dfr(seq_along(g_c), function(i_c) {
    member_ids <- g_c[[i_c]]
    tibble(
      g = paste0("g_", i_c),
      n_nm = member_ids,
      # strip the "N_" prefix to recover the row number in `noisy`
      n = as.integer(gsub(pattern = "N_", replacement = "", x = member_ids))
    )
  })
gwithn
# A tibble: 429 x 3
   g     n_nm      n
   <chr> <chr> <int>
 1 g_1   N_36     36
 2 g_1   N_37     37
 3 g_1   N_38     38
 4 g_1   N_39     39
 5 g_1   N_40     40
 6 g_1   N_41     41
 7 g_1   N_42     42
 8 g_1   N_43     43
 9 g_1   N_44     44
10 g_1   N_45     45
# ... with 419 more rows

For the community algorithm: 67 distinct people in 429 records, when there are in fact nrow(entities) people in the original dataset.

plot one random component

plot(gi)

Plot Whole Graph

plot(g)

Extract df of sub_graph with Node

So now we will summarize the data for later in a data frame.

#' Concatenate the `n` most frequent values of x as "value(count)" strings.
#'
#' Fixes two defects in the original: the top-n cap used length(x) instead of
#' the number of distinct values, producing "NA(NA)" entries whenever there
#' were fewer distinct values than n; and the magrittr pipe dependency is
#' removed (base R only). Returns "" for empty input.
#'
#' @param x a vector (coerced to character)
#' @param sep separator between entries
#' @param n maximum number of entries to report
most_common_names_concat <- function(x, sep = "; ", n = 2){
    x = as.character(x)
    counts <- sort(table(x), decreasing = TRUE)
    top <- counts[seq_len(min(n, length(counts)))]
    paste0(names(top), "(", top, ")", collapse = sep)
}
#' The `n` most frequent values of x as a named list (name -> integer count).
#'
#' Fixes the same top-n defect as most_common_names_concat: the cap used
#' length(x) rather than the number of distinct values, so inputs with fewer
#' distinct values than n picked up NA entries. The tibble round-trip is
#' replaced with base R, preserving the original's named-list-of-integers
#' return shape.
#'
#' @param x a vector
#' @param n maximum number of entries to report
most_common_names_vector <- function(x, n = 2){
  counts <- sort(table(x), decreasing = TRUE)
  top <- counts[seq_len(min(n, length(counts)))]
  out <- as.list(as.integer(top))
  names(out) <- names(top)
  out
}
#' All unique values of x, most frequent first, wrapped in a one-element list
#' (so summarise() can store it in a list-column).
#' NOTE: table() drops NA values, so NAs never appear in the output.
#' Rewritten in base R to drop the magrittr pipe dependency.
unique_list <- function(x){
  list(names(sort(table(x), decreasing = TRUE)))
}
#' Concatenate the unique non-NA values of x into a single string.
#' Rewritten in base R to drop the magrittr pipe dependency.
#'
#' @param x a vector
#' @param sep separator between values
concat_vect_remove <- function(x, sep = "; "){
  vals <- unique(x)
  vals <- vals[!is.na(vals)]
  paste0(vals, collapse = sep)
}
#' Number of distinct non-NA values in x.
#' Rewritten in base R to drop the magrittr pipe dependency; filtering NA
#' before unique() gives the same count as the original (unique then filter).
count_unique <- function(x){
  length(unique(x[!is.na(x)]))
}

Summarize

Now we know all the misspellings of each person's name.

# Collapse each predicted group `g` to one summary row: every spelling
# variant per field (via unique_list), plus key_n = the number of distinct
# true keys in the group. key_n > 1 means the group wrongly merged people.
de_noised <- 
  noisy %>%
  mutate(., n = 1:nrow(.)) %>% 
  left_join(gwithn, by = "n") %>%
  group_by(g) %>%
  summarise(n = n(), 
           first_name = unique_list(first_name),
           last_name = unique_list(last_name),
           city = unique_list(city),
           dob = unique_list(dob),
           key_n = n_distinct(key),
           key = unique_list(key)#,
           #key = most_common_names_concat(key)
           ) %>% ungroup()%>%
            arrange(desc(key_n)) 

de_noised
# A tibble: 67 x 8
   g         n first_name last_name  city      dob      key_n key     
   <chr> <int> <list>     <list>     <list>    <list>   <int> <list>  
 1 g_11     30 <chr [8]>  <chr [11]> <chr [16~ <chr [8~     4 <chr [4~
 2 g_27     27 <chr [10]> <chr [10]> <chr [12~ <chr [6~     4 <chr [4~
 3 g_31     11 <chr [1]>  <chr [3]>  <chr [7]> <chr [4~     2 <chr [2~
 4 g_1      12 <chr [3]>  <chr [8]>  <chr [5]> <chr [4~     1 <chr [1~
 5 g_10     11 <chr [4]>  <chr [3]>  <chr [5]> <chr [1~     1 <chr [1~
 6 g_12      9 <chr [3]>  <chr [4]>  <chr [4]> <chr [1~     1 <chr [1~
 7 g_13      9 <chr [4]>  <chr [2]>  <chr [2]> <chr [1~     1 <chr [1~
 8 g_14      9 <chr [5]>  <chr [4]>  <chr [7]> <chr [2~     1 <chr [1~
 9 g_15      9 <chr [4]>  <chr [6]>  <chr [1]> <chr [4~     1 <chr [1~
10 g_16      9 <chr [5]>  <chr [1]>  <chr [1]> <chr [2~     1 <chr [1~
# ... with 57 more rows

Conclusion

Make a form with validation checks! … Please…

sessionInfo

print(R.version.string)
[1] "R version 4.0.4 (2021-02-15)"
sessionInfo(package = NULL)
R version 4.0.4 (2021-02-15)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 18363)

Matrix products: default

locale:
[1] LC_COLLATE=English_Canada.1252  LC_CTYPE=English_Canada.1252   
[3] LC_MONETARY=English_Canada.1252 LC_NUMERIC=C                   
[5] LC_TIME=English_Canada.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods  
[7] base     

other attached packages:
 [1] snakecase_0.11.0 lemon_0.4.5      igraph_1.2.6    
 [4] magrittr_2.0.1   AUC_0.3.0        plotROC_2.2.1   
 [7] phonics_1.3.9    reclin_0.1.1     ldat_0.3.3      
[10] Rcpp_1.0.6       lvec_0.2.2       lexicon_1.2.1   
[13] babynames_1.0.1  fuzzyjoin_0.1.6  lubridate_1.7.10
[16] janitor_2.1.0    forcats_0.5.1    stringr_1.4.0   
[19] dplyr_1.0.5      purrr_0.3.4      readr_1.4.0     
[22] tidyr_1.1.3      tibble_3.1.0     ggplot2_3.3.3   
[25] tidyverse_1.3.0  knitr_1.31      

loaded via a namespace (and not attached):
 [1] fs_1.5.0           httr_1.4.2         tools_4.0.4       
 [4] backports_1.2.1    bslib_0.2.4        utf8_1.2.1        
 [7] R6_2.5.0           DBI_1.1.1          colorspace_2.0-0  
[10] withr_2.4.1        tidyselect_1.1.0   gridExtra_2.3     
[13] downlit_0.2.1      compiler_4.0.4     cli_2.4.0         
[16] rvest_1.0.0        xml2_1.3.2         labeling_0.4.2    
[19] sass_0.3.1         scales_1.1.1       digest_0.6.27     
[22] rmarkdown_2.7      stringdist_0.9.6.3 pkgconfig_2.0.3   
[25] htmltools_0.5.1.1  dbplyr_2.1.0       highr_0.8         
[28] maps_3.3.0         rlang_0.4.10       readxl_1.3.1      
[31] rstudioapi_0.13    jquerylib_0.1.3    generics_0.1.0    
[34] farver_2.1.0       jsonlite_1.7.2     distill_1.2       
[37] munsell_0.5.0      fansi_0.4.2        lifecycle_1.0.0   
[40] stringi_1.5.3      yaml_2.2.1         plyr_1.8.6        
[43] grid_4.0.4         parallel_4.0.4     crayon_1.4.1      
[46] lattice_0.20-41    haven_2.3.1        hms_1.0.0         
[49] pillar_1.5.1       lpSolve_5.6.15     reprex_1.0.0      
[52] glue_1.4.2         evaluate_0.14      data.table_1.14.0 
[55] modelr_0.1.8       vctrs_0.3.7        cellranger_1.1.0  
[58] gtable_0.3.0       assertthat_0.2.1   xfun_0.22         
[61] syuzhet_1.0.6      broom_0.7.5        ellipsis_0.3.1